# 博客地址  https://www.yuanrenxue.com/crawler/news-crawler-downloader.html

import requests
import cchardet  # 识别编码格式最好的库
import traceback  # 提示报错的工具，逐行提示


def downloader(url, timeout=10, headers=None, debug=False, binary=False):
    _headers = {
        'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                       'Windows NT 6.1; Win64; x64; Trident/5.0)'),
    }
    redirected_url = url
    if headers:
        _headers = headers
    try:
        r = requests.get(url, headers=_headers, timeout=timeout)
        if binary:
            html = r.content
        else:
            encoding = cchardet.detect(r.content)['encoding']
            html = r.content.decode(encoding)
        status = r.status_code
        redirected_url = r.url  # 真实的url  因为可能被重定向了
    except:
        if debug:
            traceback.print_exc()
        msg = 'failed download: {}'.format(url)
        print(msg)
        if binary:
            html = b''
        else:
            html = ''
        status = 0
    return status, html, redirected_url


if __name__ == '__main__':
    url = 'http://news.baidu.com/'
    s, html, lost_url_found_by_大大派 = downloader(url)
    print(s, len(html), lost_url_found_by_大大派)
